Read in files
and convert to wide format
#4CE long format Labs Data
patient_obs <- read_csv('penn-data/labs_long_thrombo_v2.csv') %>%
mutate(severity = (severe_ind == 1) %>%
as_factor() %>%
fct_recode('nonsevere' = 'FALSE', 'severe' = 'TRUE'))
#Code, Descriptions and Ranges
lab_mapping <- read_csv('public-data/loinc-map.csv')
load('public-data/lab.range.rda')
# load('public-data/code.dict.rda')
lab_bounds <- lab.range[, -1] %>%
colMeans() %>%
data.frame(value = .) %>%
rownames_to_column() %>%
separate(col = rowname, into = c('LOINC', 'bound'), sep = '_') %>%
mutate(LOINC = gsub('loinc', '', LOINC) %>% gsub('\\.', '-', .),
value = exp(value)) %>%
pivot_wider(names_from = 'bound', values_from = 'value') %>%
left_join(lab_mapping, by = 'LOINC') %>%
{.}
patient_obs_wide <- patient_obs %>%
left_join(lab_bounds, by = c('concept_code' = 'LOINC')) %>%
select(- concept_code) %>%
pivot_wider(id_cols = c(patient_num, days_since_admission, severity),
names_from = short_name,
values_from = value,
values_fn = mean)
#check NAs in the Wide format
na_stats <- patient_obs_wide %>%
select(- c(patient_num, days_since_admission, severity)) %>%
is.na() %>%
`!`
na_df <- data.frame(value_existed = colSums(na_stats),
prop_existed = colMeans(na_stats)) %>%
rownames_to_column('lab') %>%
mutate(prop_na = 1 - prop_existed,
lab = fct_reorder(lab, value_existed))
n_values <- na_df %>%
ggplot(aes(x = value_existed, y = lab)) +
geom_col() +
labs(x = 'Number of values', y = NULL)
na_prob <- na_df %>%
rename('Valid value' = prop_existed, 'NA' = prop_na) %>%
pivot_longer(c(`Valid value`, `NA`)) %>%
ggplot(aes(x = value, y = lab, fill = name)) +
geom_col() +
scale_fill_discrete(guide = guide_legend(reverse = TRUE)) +
labs(x = 'Proportion', y = NULL) +
# guides(fill = guide_legend(reverse = TRUE))
theme(
axis.text.y = element_blank(),
legend.key.width = unit(6, 'mm'),
legend.key.height = unit(4, 'mm'),
legend.position = 'bottom')
plot_grid(n_values, na_prob, nrow = 1, axis = 'b', align = 'h')

Analyze missingness and frequency of measures for each lab
patient_obs_long <- patient_obs_wide %>%
pivot_longer(-c(patient_num, days_since_admission, severity),
names_to = 'lab', values_to = 'value',
values_drop_na = TRUE)
per_lab <- patient_obs_long %>%
group_by(lab, patient_num, severity) %>%
count(name = 'n_obs') %>%
ungroup() %>%
group_by(lab, severity) %>%
count(n_obs) %>%
ungroup() %>%
pivot_wider(names_from = severity, values_from = n, values_fill = 0) %>%
mutate(both_severities = nonsevere + severe) %>%
mutate(prop_nonsevere = nonsevere/n_nonsevere,
prop_severe = severe/n_severe,
prop_both = both_severities/nrow(days_count_min_max))
lab_medians <-
patient_obs_long %>%
add_count(lab, name = 'total_obs') %>%
group_by(lab) %>%
mutate(total_patients = length(unique(patient_num))) %>%
add_count(patient_num, name = 'n_obs_patients') %>%
mutate(median_obs_per_patient = median(n_obs_patients),
n_greater0 = n_obs_patients > median_obs_per_patient,
n_greater1 = n_obs_patients > median_obs_per_patient + 1,
n_greater2 = n_obs_patients > median_obs_per_patient + 2) %>%
group_by(severity) %>%
mutate(each_med_obs_per_patient = median(n_obs_patients)) %>%
ungroup(severity) %>%
select(- c(days_since_admission, value)) %>%
distinct() %>%
group_by(lab) %>%
mutate(across(contains('n_greater'), sum)) %>%
select(- c(n_obs_patients, patient_num)) %>%
distinct() %>%
pivot_wider(names_from = severity, values_from = each_med_obs_per_patient) %>%
rename('median_obs_per_severe_patient' = severe,
'median_obs_per_non_severe_patient' = nonsevere) %>%
select(lab, total_obs, total_patients, starts_with('med'), starts_with('n_'))
lab_medians %>%
datatable(rownames = FALSE)
n_greater0 shows the number of patients who had more observations than the median. n_greater1 shows the number of patients who had more observations than the median + 1. n_greater2 shows the number of patients who had more observations than the median + 2.
In the figure below:
- Grey dash line:
Reference Low
- Grey solid line:
Reference High
- Black dash line:
lower bound outlier (QC)
- Black solid line:
upper bound outlier (QC)
patient_obs_long %>%
left_join(lab_bounds, by = c('lab' = 'short_name')) %>%
ggplot(aes(y = severity, x = value, fill = severity)) +
geom_violin() +
scale_fill_brewer(palette = 'Dark2', guide = guide_legend(reverse = TRUE)) +
labs(y = NULL, x = NULL) +
geom_vline(aes(xintercept = `Reference Low`), linetype = 'dashed', color = 'grey') +
geom_vline(aes(xintercept = `Reference High`), color = 'grey') +
geom_vline(aes(xintercept = LB), linetype = 'dashed') +
geom_vline(aes(xintercept = UB)) +
facet_wrap(~ lab, scales = 'free', ncol = 2, strip.position = 'left') +
theme(axis.text.y = element_blank())

Missing data heatmap
Capped at 20 days with observations.
per_lab %>%
select(lab, n_obs, both_severities, severe, nonsevere) %>%
filter(n_obs <= 20) %>%
pivot_longer(c(both_severities, severe, nonsevere)) %>%
mutate(name = name %>% fct_recode(
'All patients' = 'both_severities',
'Severe patients' = 'severe',
'Non-severe patients' = 'nonsevere'
)) %>%
ggplot(aes(x = n_obs, fill = value, y = fct_reorder(lab, n_obs))) +
geom_tile(colour = "white", size = 0.2)+
geom_text(aes(label = value), colour = "white", size = 2) +
scale_y_discrete(expand = c(0, 0))+
scale_x_continuous(expand = c(0, 0),
breaks = c(1:max(per_lab$n_obs)),
labels = c(1:max(per_lab$n_obs)))+
scale_fill_gradient(low = "lightgrey", high = "darkblue",
limits = c(0, max(per_lab$both_severities))) +
facet_wrap(~ name, nrow = 1) +
labs(x = 'Number of values a patient has for each lab',
y = NULL, fill = '# patients') +
theme(panel.grid.major = element_blank(),
legend.position = c(0.93, 0.2),
axis.ticks.y = element_blank()
)

“Binned” heatmap: every 15 days
per_lab %>%
mutate(obs_bin = cut(n_obs, breaks = 15, include.lowest = TRUE)) %>%
group_by(lab, obs_bin) %>%
summarise(both_severities = sum(both_severities),
severe = sum(severe),
nonsevere = sum(nonsevere),
.groups = 'drop') %>%
select(lab, obs_bin, both_severities, severe, nonsevere) %>%
pivot_longer(c(both_severities, severe, nonsevere)) %>%
mutate(name = name %>% fct_recode(
'All patients' = 'both_severities',
'Severe patients' = 'severe',
'Non-severe patients' = 'nonsevere'
)) %>%
ggplot(aes(x = obs_bin, fill = value, y = fct_reorder(lab, value))) +
geom_tile(colour = "white", size = 0.2) +
geom_text(aes(label = value), colour = "white", size = 2) +
scale_y_discrete(expand = c(0, 0))+
scale_fill_gradient(low = "lightgrey", high = "darkblue") +
facet_wrap(~ name, nrow = 1) +
labs(x = 'Number of values a patient has for each lab',
y = NULL, fill = '# patients') +
theme(panel.grid.major = element_blank(),
legend.position = c(0.93, 0.2),
axis.text.x = element_text(angle = 90, hjust = 1),
axis.ticks.y = element_blank()
)


## [1] 15
per_lab %>%
select(lab, n_obs, prop_both, prop_severe, prop_nonsevere) %>%
filter(n_obs <= 20) %>%
pivot_longer(c(prop_both, prop_severe, prop_nonsevere)) %>%
mutate(name = name %>% fct_recode(
'Compared to all patients' = 'prop_both',
'Compared to all severe patients' = 'prop_severe',
'Compared to all non-severe patients' = 'prop_nonsevere'
)) %>%
ggplot(aes(x = n_obs, fill = value, y = fct_reorder(lab, n_obs))) +
geom_tile(colour = "white", size = 0.2)+
geom_text(aes(label = round(value, 2)), colour = "white", size = 2) +
scale_y_discrete(expand = c(0, 0)) +
scale_x_continuous(expand = c(0, 0),
breaks = c(1:max(per_lab$n_obs)),
labels = c(1:max(per_lab$n_obs)))+
scale_fill_gradient(low = "lightgrey", high = "darkblue",
labels = scales::percent_format(accuracy = 1L)) +
facet_wrap(~ name, nrow = 1) +
labs(x = 'Number of values a patient has for each lab',
y = NULL, fill = '% patients') +
theme(panel.grid.major = element_blank(),
legend.position = c(0.93, 0.2),
axis.ticks.y = element_blank()
)

per_lab %>%
mutate(obs_bin = cut(n_obs, breaks = 15, include.lowest = TRUE)) %>%
group_by(lab, obs_bin) %>%
summarise(prop_both = sum(prop_both),
prop_severe = sum(prop_severe),
prop_nonsevere = sum(prop_nonsevere),
.groups = 'drop') %>%
select(lab, obs_bin, prop_both, prop_severe, prop_nonsevere) %>%
pivot_longer(c(prop_both, prop_severe, prop_nonsevere)) %>%
mutate(name = name %>% fct_recode(
'Compared to all patients' = 'prop_both',
'Compared to all severe patients' = 'prop_severe',
'Compared to all non-severe patients' = 'prop_nonsevere'
)) %>%
ggplot(aes(x = obs_bin, fill = value, y = fct_reorder(lab, value))) +
geom_tile(colour = "white", size = 0.2) +
geom_text(aes(label = round(value, 2)), colour = "white", size = 2) +
scale_y_discrete(expand = c(0, 0)) +
scale_fill_gradient(low = "lightgrey", high = "darkblue",
labels = scales::percent_format(accuracy = 1L)) +
facet_wrap(~ name, nrow = 1) +
labs(x = 'Number of values a patient has for each lab',
y = NULL, fill = '% patients') +
theme(panel.grid.major = element_blank(),
legend.position = c(0.93, 0.2),
axis.text.x = element_text(angle = 90, hjust = 1),
axis.ticks.y = element_blank()
)

Denominator: total number of patients, total number of non-severe patients, and total number of severe patients, respectively.
per_lab %>%
select(lab, n_obs, severe, nonsevere) %>%
filter(n_obs <= 90) %>%
pivot_longer(c(severe, nonsevere)) %>%
mutate(lab = fct_reorder(lab, n_obs),
name = name %>% fct_recode(
'Severe patients' = 'severe',
'Non-severe patients' = 'nonsevere'
)) %>%
ggplot(aes(x = n_obs, fill = name, y = value)) +
geom_col() +
scale_fill_brewer(palette = 'Dark2', direction = -1) +
facet_wrap(~ lab, scales = 'free') +
labs(x = 'Number of values a patient has for each lab',
y = NULL, fill = '# patients') +
theme(panel.grid.major = element_blank(),
legend.position = c(0.9, 0.2),
axis.ticks.y = element_blank()
)

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cowplot_1.1.0 tibble_3.0.4 DT_0.16 forcats_0.5.0 tidyr_1.1.2
## [6] dplyr_1.0.2 readr_1.4.0 ggplot2_3.3.2
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-2 pillar_1.4.6 compiler_4.0.3 tools_4.0.3
## [5] digest_0.6.27 jsonlite_1.7.1 evaluate_0.14 lifecycle_0.2.0
## [9] gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.8 rstudioapi_0.13
## [13] cli_2.1.0 crosstalk_1.1.0.1 yaml_2.2.1 xfun_0.19
## [17] withr_2.3.0 stringr_1.4.0 knitr_1.30 generics_0.1.0
## [21] vctrs_0.3.4 htmlwidgets_1.5.2 hms_0.5.3 grid_4.0.3
## [25] tidyselect_1.1.0 glue_1.4.2 R6_2.5.0 fansi_0.4.1
## [29] rmarkdown_2.5 farver_2.0.3 purrr_0.3.4 magrittr_1.5
## [33] scales_1.1.1 ellipsis_0.3.1 htmltools_0.5.0 assertthat_0.2.1
## [37] colorspace_2.0-0 labeling_0.4.2 stringi_1.5.3 munsell_0.5.0
## [41] crayon_1.3.4
---
title: "Missingness analysis"
author: Amelia Tan, Arianna Dagliati, Trang Le
date: "27 October 2020"
---


```{r setup, warning=FALSE, message=FALSE}
library(ggplot2)
library(readr)
library(dplyr)
library(tidyr)
library(forcats)
library(DT)
library(tibble)
library(cowplot)

theme_set(theme_bw() + 
            theme(legend.title = element_blank(),
                  panel.grid.minor = element_blank()))
```

## Read in files 
and convert to wide format
```{R message=FALSE}
#4CE long format Labs Data
patient_obs <- read_csv('penn-data/labs_long_thrombo_v2.csv') %>% 
  mutate(severity = (severe_ind == 1) %>% 
           as_factor() %>% 
           fct_recode('nonsevere' = 'FALSE', 'severe' = 'TRUE'))

#Code, Descriptions and Ranges
lab_mapping <- read_csv('public-data/loinc-map.csv')
load('public-data/lab.range.rda')
# load('public-data/code.dict.rda')
```

```{r}
lab_bounds <- lab.range[, -1] %>% 
  colMeans() %>% 
  data.frame(value = .) %>% 
  rownames_to_column() %>% 
  separate(col = rowname, into = c('LOINC', 'bound'), sep = '_') %>% 
  mutate(LOINC = gsub('loinc', '', LOINC) %>% gsub('\\.', '-', .),
         value = exp(value)) %>% 
  pivot_wider(names_from = 'bound', values_from = 'value') %>%
  left_join(lab_mapping, by = 'LOINC') %>%
  {.}
```

```{R}
patient_obs_wide <- patient_obs %>% 
  left_join(lab_bounds, by = c('concept_code' = 'LOINC')) %>% 
  select(- concept_code) %>% 
  pivot_wider(id_cols = c(patient_num, days_since_admission, severity),
              names_from = short_name,
              values_from = value,
              values_fn = mean)

#check NAs in the Wide format
na_stats <- patient_obs_wide %>% 
  select(- c(patient_num, days_since_admission, severity)) %>% 
  is.na() %>% 
  `!` 

na_df <- data.frame(value_existed = colSums(na_stats), 
           prop_existed = colMeans(na_stats)) %>% 
  rownames_to_column('lab') %>% 
  mutate(prop_na = 1 - prop_existed,
         lab = fct_reorder(lab, value_existed))

n_values <- na_df %>% 
  ggplot(aes(x = value_existed, y = lab)) +
  geom_col() + 
  labs(x = 'Number of values', y = NULL)

na_prob <- na_df %>%
  rename('Valid value' = prop_existed, 'NA' = prop_na) %>% 
  pivot_longer(c(`Valid value`, `NA`)) %>% 
  ggplot(aes(x = value, y = lab, fill = name)) +
  geom_col() + 
  scale_fill_discrete(guide = guide_legend(reverse = TRUE)) +
  labs(x = 'Proportion', y = NULL) +
  # guides(fill = guide_legend(reverse = TRUE))
  theme(
    axis.text.y = element_blank(),
    legend.key.width = unit(6, 'mm'),
    legend.key.height = unit(4, 'mm'),
    legend.position = 'bottom')

plot_grid(n_values, na_prob, nrow = 1, axis = 'b', align = 'h')

penn_na_df <- na_df
```

## Number of observation (days) per patient
```{R}
days_count_min_max <- patient_obs_wide %>%
  group_by(patient_num, severity) %>%
  summarise(
    n_values = n_distinct(days_since_admission),
    min_day = min(days_since_admission),
    max_day = max(days_since_admission),
    .groups = 'drop'
  ) %>% 
  mutate(time_obs = max_day - min_day) %>% 
  select(-patient_num) %>% 
  add_count(severity, name = 'n_severity') 

agg_n_values <- days_count_min_max %>% 
  count(severity, n_severity, n_values,
        name = 'n_nvals')
agg_max_day <- days_count_min_max %>% 
  count(severity, n_severity, max_day,
        name = 'n_maxday')

(n_severe <- sum(days_count_min_max$severity == 'severe'))
(n_nonsevere <- sum(days_count_min_max$severity == 'nonsevere'))
# summary(days_count_min_max$n_values)
```

## Histogram of the number of days with at least one observation
```{R}
agg_n_values %>% 
  ggplot(aes(x = n_values, y = n_nvals, fill = severity)) +
  geom_col(alpha = 0.5) +
  scale_fill_brewer(palette = 'Dark2', direction = -1) +
  labs(fill = 'Severe?', 
       x = "Number of days with data", 
       y = "Count")
```

## Histogram of length of stay
i.e. last day with observation-first day with observation

We need to check for readmission here.
```{R}
agg_max_day %>% 
  ggplot(aes(x = max_day, y = n_maxday, fill = severity)) +
  geom_col(alpha = 0.5) +
  scale_fill_brewer(palette = 'Dark2', direction = -1) +
  labs(fill = 'Severe?', 
       x = "Number of days with data", 
       y = "Count")
```

## Analyze missingness and frequency of measures for each lab

```{r}
patient_obs_long <- patient_obs_wide %>% 
  pivot_longer(-c(patient_num, days_since_admission, severity),
               names_to = 'lab', values_to = 'value',
               values_drop_na = TRUE)
  
per_lab <- patient_obs_long %>% 
  group_by(lab, patient_num, severity) %>% 
  count(name = 'n_obs') %>% 
  ungroup() %>% 
  group_by(lab, severity) %>% 
  count(n_obs) %>% 
  ungroup() %>% 
  pivot_wider(names_from = severity, values_from = n, values_fill = 0) %>% 
  mutate(both_severities = nonsevere + severe) %>% 
  mutate(prop_nonsevere = nonsevere/n_nonsevere,
         prop_severe = severe/n_severe,
         prop_both = both_severities/nrow(days_count_min_max))

lab_medians <-
  patient_obs_long %>% 
  add_count(lab, name = 'total_obs') %>% 
  group_by(lab) %>% 
  mutate(total_patients = length(unique(patient_num))) %>% 
  add_count(patient_num, name = 'n_obs_patients') %>% 
  mutate(median_obs_per_patient = median(n_obs_patients),
         n_greater0 = n_obs_patients > median_obs_per_patient,
         n_greater1 = n_obs_patients > median_obs_per_patient + 1,
         n_greater2 = n_obs_patients > median_obs_per_patient + 2) %>% 
  group_by(severity) %>% 
  mutate(each_med_obs_per_patient = median(n_obs_patients)) %>% 
  ungroup(severity) %>% 
  select(- c(days_since_admission, value)) %>% 
  distinct() %>% 
  group_by(lab) %>% 
  mutate(across(contains('n_greater'), sum)) %>% 
  select(- c(n_obs_patients, patient_num)) %>% 
  distinct() %>% 
  pivot_wider(names_from = severity, values_from = each_med_obs_per_patient) %>% 
  rename('median_obs_per_severe_patient' = severe,
         'median_obs_per_non_severe_patient' = nonsevere) %>% 
  select(lab, total_obs, total_patients, starts_with('med'), starts_with('n_'))

lab_medians %>% 
  datatable(rownames = FALSE)
```


`n_greater0` shows the number of patients who had more observations than the median.
`n_greater1` shows the number of patients who had more observations than the median + 1.
`n_greater2` shows the number of patients who had more observations than the median + 2.

```{r eval=FALSE, include=FALSE}
lab_medians %>% 
  select(lab, total_obs, total_patients) %>%
  pivot_longer(- lab) %>% 
  ggplot(aes(x = value, y = fct_reorder(lab, value))) +
  geom_col() +
  facet_grid(cols = vars(name), scales = 'free_x', space = 'free') +
  labs(y = NULL)
```

In the figure below:

- Grey dash line: `Reference Low`
- Grey solid line: `Reference High`
- Black dash line: `lower bound outlier` (QC)
- Black solid line: `upper bound outlier` (QC)

```{r fig.width=9, fig.height=11}
patient_obs_long %>% 
  left_join(lab_bounds, by = c('lab' = 'short_name')) %>% 
  ggplot(aes(y = severity, x = value, fill = severity)) +
  geom_violin() +
  scale_fill_brewer(palette = 'Dark2', guide = guide_legend(reverse = TRUE)) +
  labs(y = NULL, x = NULL) +
  geom_vline(aes(xintercept = `Reference Low`), linetype = 'dashed', color = 'grey') +
  geom_vline(aes(xintercept = `Reference High`), color = 'grey') +
  geom_vline(aes(xintercept = LB), linetype = 'dashed') +
  geom_vline(aes(xintercept = UB)) +
  facet_wrap(~ lab, scales = 'free', ncol = 2, strip.position = 'left') +
  theme(axis.text.y = element_blank())
```

## Missing data heatmap

Capped at 20 days with observations.

```{r fig.width=12}
per_lab %>% 
  select(lab, n_obs, both_severities, severe, nonsevere) %>% 
  filter(n_obs <= 20) %>% 
  pivot_longer(c(both_severities, severe, nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'All patients' = 'both_severities',
    'Severe patients' = 'severe',
    'Non-severe patients' = 'nonsevere'
  )) %>% 
  ggplot(aes(x = n_obs, fill = value, y = fct_reorder(lab, n_obs))) + 
  geom_tile(colour = "white", size = 0.2)+
  geom_text(aes(label = value), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0))+
  scale_x_continuous(expand = c(0, 0),
                     breaks = c(1:max(per_lab$n_obs)),
                     labels = c(1:max(per_lab$n_obs)))+
  scale_fill_gradient(low = "lightgrey", high = "darkblue",
                      limits = c(0, max(per_lab$both_severities))) +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '# patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.ticks.y = element_blank()
  )
```

"Binned" heatmap: every 15 days

```{r fig.width=12}
per_lab %>%
  mutate(obs_bin = cut(n_obs, breaks = 15, include.lowest = TRUE)) %>% 
  group_by(lab, obs_bin) %>% 
  summarise(both_severities = sum(both_severities), 
            severe = sum(severe), 
            nonsevere = sum(nonsevere), 
            .groups = 'drop') %>% 
  select(lab, obs_bin, both_severities, severe, nonsevere) %>% 
  pivot_longer(c(both_severities, severe, nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'All patients' = 'both_severities',
    'Severe patients' = 'severe',
    'Non-severe patients' = 'nonsevere'
  )) %>% 
  ggplot(aes(x = obs_bin, fill = value, y = fct_reorder(lab, value))) + 
  geom_tile(colour = "white", size = 0.2) +
  geom_text(aes(label = value), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0))+
  scale_fill_gradient(low = "lightgrey", high = "darkblue") +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '# patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.text.x = element_text(angle = 90, hjust = 1),
        axis.ticks.y = element_blank()
  )
```

```{r}
boxplot(per_lab$n_obs)
sum(per_lab$n_obs > 90)
```

```{r fig.width=12}
per_lab %>% 
  select(lab, n_obs, prop_both, prop_severe, prop_nonsevere) %>% 
  filter(n_obs <= 20) %>% 
  pivot_longer(c(prop_both, prop_severe, prop_nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'Compared to all patients' = 'prop_both',
    'Compared to all severe patients' = 'prop_severe',
    'Compared to all non-severe patients' = 'prop_nonsevere'
  )) %>% 
  ggplot(aes(x = n_obs, fill = value, y = fct_reorder(lab, n_obs))) + 
  geom_tile(colour = "white", size = 0.2)+
  geom_text(aes(label = round(value, 2)), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_x_continuous(expand = c(0, 0),
                     breaks = c(1:max(per_lab$n_obs)),
                     labels = c(1:max(per_lab$n_obs)))+ 
  scale_fill_gradient(low = "lightgrey", high = "darkblue",
                      labels = scales::percent_format(accuracy = 1L)) +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '% patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.ticks.y = element_blank()
  )
```

```{r fig.width=12}
per_lab %>%
  mutate(obs_bin = cut(n_obs, breaks = 15, include.lowest = TRUE)) %>% 
  group_by(lab, obs_bin) %>% 
  summarise(prop_both = sum(prop_both), 
            prop_severe = sum(prop_severe), 
            prop_nonsevere = sum(prop_nonsevere), 
            .groups = 'drop') %>% 
  select(lab, obs_bin, prop_both, prop_severe, prop_nonsevere) %>% 
  pivot_longer(c(prop_both, prop_severe, prop_nonsevere)) %>% 
  mutate(name = name %>% fct_recode(
    'Compared to all patients' = 'prop_both',
    'Compared to all severe patients' = 'prop_severe',
    'Compared to all non-severe patients' = 'prop_nonsevere'
  )) %>% 
  ggplot(aes(x = obs_bin, fill = value, y = fct_reorder(lab, value))) + 
  geom_tile(colour = "white", size = 0.2) +
  geom_text(aes(label = round(value, 2)), colour = "white", size = 2) +
  scale_y_discrete(expand = c(0, 0)) +
  scale_fill_gradient(low = "lightgrey", high = "darkblue",
                      labels = scales::percent_format(accuracy = 1L)) +
  facet_wrap(~ name, nrow = 1) +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '% patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.93, 0.2),
        axis.text.x = element_text(angle = 90, hjust = 1),
        axis.ticks.y = element_blank()
  )
```

Denominator: total number of patients, total number of non-severe patients,
and total number of severe patients, respectively.

```{r}
per_lab %>% 
  select(lab, n_obs, severe, nonsevere) %>% 
  filter(n_obs <= 90) %>%
  pivot_longer(c(severe, nonsevere)) %>% 
  mutate(lab = fct_reorder(lab, n_obs),
         name = name %>% fct_recode(
    'Severe patients' = 'severe',
    'Non-severe patients' = 'nonsevere'
  )) %>% 
  ggplot(aes(x = n_obs, fill = name, y = value)) + 
  geom_col() +
  scale_fill_brewer(palette = 'Dark2', direction = -1) +
  facet_wrap(~ lab, scales = 'free') +
  labs(x = 'Number of values a patient has for each lab',
       y = NULL, fill = '# patients') +
  theme(panel.grid.major = element_blank(),
        legend.position = c(0.9, 0.2),
        axis.ticks.y = element_blank()
  )
```


```{r}
sessionInfo()
```
